
* Session setup: turn off output paging and empty the data in memory
set more off
clear

/*********************************************************************************
Name: patients_summary_stats.do

Data In: [Data/Original/patient_surveys.dta,
		  Data/Original/patient_surveys_attempts.dta,
		  Data/Original/patient_start_dates.dta,
		  Data/Original/hw_roster.dta,
		  Data/Intermediate/verified_patients.dta,
		  Data/Intermediate/center_monthly_gps_coordinates.dta]

Data Out: [Data/Intermediate/patients_summary_stats.dta,
		   Data/Intermediate/patients_summary_stats_defaults.dta]
		   
Results Out: [Results/Paper/Table2.out,
			  Results/Paper/Table5_PanelB.out,
			  Results/Appendix/TableC3_PanelB.log,
			  Results/Appendix/TableD3_PanelB.log]

Purpose of do-file: Generating descriptive statistics and conducting balance checks on patients' characteristics

Organization: PART-1: Creating the variables for the analysis
			  PART-2: Checking the balance in the characteristics of patients enrolled before the experiment across the treatment and control groups
			  PART-3: Checking the balance in the characteristics of patients enrolled after the experiment across the treatment and control groups
*********************************************************************************/

* Working directory: every relative path below is resolved from here.
* The DIRECTORY global is not defined in this file and must be set beforehand.
cd "${DIRECTORY}"


****************************************
*** PART-1 *** Creating the variables for the analysis
****************************************

** Loading the patient surveys and attaching the auxiliary datasets
** (after each merge, only observations found in both datasets are kept)

use "Data/Original/patient_surveys.dta", clear
count

* Survey attempts, matched one-to-one on q1_ques_code and a02_entry_exit
merge 1:1 q1_ques_code a02_entry_exit using "Data/Original/patient_surveys_attempts.dta"
drop if _merge != 3
drop _merge

* Treatment start dates, one record per q1_ques_code
merge m:1 q1_ques_code using "Data/Original/patient_start_dates.dta"
drop if _merge != 3
drop _merge

* Health worker roster, one record per Unique_ID and UID_Center
merge m:1 Unique_ID UID_Center using "Data/Original/hw_roster.dta"
drop if _merge != 3
drop _merge

* Verified patients, matched one-to-one on q1_ques_code and a02_entry_exit
merge 1:1 q1_ques_code a02_entry_exit using "Data/Intermediate/verified_patients.dta"
drop if _merge != 3
drop _merge


** Constructing the analysis variables
** Each indicator is left missing when its source variable is system missing (.)

* Gender: indicator for a1_gender == 1
tabulate a1_gender
gen male = (a1_gender == 1) if a1_gender != .

* Age: code -999 is recoded to missing
tabulate a3_age
replace a3_age = . if a3_age == -999

* Caste: one indicator per response code of a4_caste
tabulate a4_caste
local caste_codes -999 1 2 3 4 5
local caste_names dont_know general obc sc st minority
forvalues i = 1/6 {
	local code : word `i' of `caste_codes'
	local name : word `i' of `caste_names'
	gen caste_`name' = (a4_caste == `code') if a4_caste != .
}

* Religion: codes -777, 3, 4 and 5 are pooled into "other"
tabulate a5_religion
gen religion_hindu = (a5_religion == 1) if a5_religion != .
gen religion_muslim = (a5_religion == 2) if a5_religion != .
gen religion_oth = inlist(a5_religion, -777, 3, 4, 5) if a5_religion != .

* Literacy: one indicator per response code of a6_read_write
tabulate a6_read_write
gen rw_both = (a6_read_write == 1) if a6_read_write != .
gen rw_none = (a6_read_write == 2) if a6_read_write != .
gen rw_onlyread = (a6_read_write == 3) if a6_read_write != .

* Education
// "Other diploma" is lumped into the secondary education group

tabulate a9_atten_sch
tabulate a16_educ_level

replace a16_educ_level = . if a16_educ_level == -999

* Diagnostic listing: education level reported although a9_atten_sch is neither 1 nor 2
list a9_atten_sch a16_educ_level if a16_educ_level != . & !inlist(a9_atten_sch, 1, 2)

* The education indicators are defined when the level is known or when a9_atten_sch == 3
local edu_known a16_educ_level != . | a9_atten_sch == 3
gen edu_belowprimary = (a9_atten_sch == 3 | inlist(a16_educ_level, 2, 3, 4, 5, 20)) if `edu_known'
gen edu_primary = (a16_educ_level > 5 & a16_educ_level < 13) if `edu_known'
gen edu_secondary = ((a16_educ_level > 12 & a16_educ_level < 18) | a16_educ_level == 19) if `edu_known'
gen edu_grad = (a16_educ_level == 18) if `edu_known'

* Diagnostic count: observations whose four education indicators do not sum to one
tempvar n_levels
gen `n_levels' = edu_belowprimary + edu_primary + edu_secondary + edu_grad
count if `n_levels' != 1 & `n_levels' != .
drop `n_levels'

* Household size
// The question excludes the patient, so 1 is added to the reported number

tabulate a20_people_live_hh a19_live_alone, missing

gen live_alone = (a19_live_alone == 1) if a19_live_alone != .
label variable live_alone "Patient lives alone"
tabulate live_alone, missing

* Patients living alone get a household of one; otherwise reported members plus the patient
gen hhd_size = cond(live_alone == 1, 1, a20_people_live_hh + 1)
label variable hhd_size "Household size"

* Household assets: indicators are missing when the answer is missing or coded -999

* Electricity
tabulate q9_have_electricity, missing
gen elec = (q9_have_electricity == 1) if q9_have_electricity != . & q9_have_electricity != -999
label variable elec "Household has electricity"

* Tap water
tabulate q21_tap_water, missing
gen tapwater = (q21_tap_water == 1) if q21_tap_water != . & q21_tap_water != -999
label variable tapwater "Household has tap water"

* TV
tabulate q13_tv, missing
gen tv = (q13_tv == 1) if q13_tv != . & q13_tv != -999
label variable tv "Household has TV"

* Refrigerator
tabulate q11_refrigerator, missing
gen fridge = (q11_refrigerator == 1) if q11_refrigerator != . & q11_refrigerator != -999
label variable fridge "Household has refrigerator"

* Own house
tabulate q19_own_house, missing
gen ownhouse = (q19_own_house == 1) if q19_own_house != . & q19_own_house != -999
label variable ownhouse "Household owns the house"

* Migration: code -999 is recoded to missing before the indicators are built
replace a22_live_neigh = . if a22_live_neigh == -999
tabulate a22_live_neigh

gen migrate_always = (a22_live_neigh == 1) if a22_live_neigh != .
gen migrate_6plus = inlist(a22_live_neigh, 2, 3) if a22_live_neigh != .
gen migrate_lessthan5 = inlist(a22_live_neigh, 4, 5) if a22_live_neigh != .

* Employment status
tabulate c0_current_working, missing
gen current_working = (c0_current_working == 1) if c0_current_working != . & c0_current_working != -999
label variable current_working "Patient currently working (includes self employment and regular wage)"

* Consistency check: every patient must have a single treatment start date across surveys
tempvar n_start_dates
bysort q1_ques_code: egen `n_start_dates' = nvals(treatment_start_date)
assert `n_start_dates' == 1
drop `n_start_dates'

* Month of treatment start, and its position relative to the experiment start month
* (MonthIntoExp equals 1 when treatment starts in the month stored in expstartdate_m)
gen treatment_start_month = mofd(treatment_start_date)
gen MonthIntoExp = treatment_start_month - expstartdate_m + 1

// Each patient is associated with exactly one center, the one she was initially allocated to, even if she later transfers to another center
// The GPS location was recorded on the first page of the survey, including, when possible, for patients who were not surveyed (when the surveyor went to their address)

* Attaching the center coordinates for the patient's center and treatment start month
merge m:1 UID_Center MonthIntoExp using "Data/Intermediate/center_monthly_gps_coordinates.dta", gen(_mergeNEWGPS)

* Center-month records without any patient are discarded
keep if _mergeNEWGPS != 2
drop _merge*

* Cleaning GPS coordinates
* For each axis, components 3, 2 and 1 are blanked when they hold the code -111 or when
* component 1 lies outside the accepted range. Component 1 is processed last so that the
* range test still sees its original value while components 3 and 2 are cleaned.
foreach axis in east north {
	if "`axis'" == "east" {
		local lower 70
		local upper 87
	}
	else {
		local lower 8
		local upper 33
	}
	forvalues part = 3(-1)1 {
		local coord a0_1_gps_`axis'`part'
		replace `coord' = . if `coord' == -111
		replace `coord' = . if (a0_1_gps_`axis'1 < `lower' | a0_1_gps_`axis'1 > `upper') & a0_1_gps_`axis'1 != .
	}
}

* Any remaining special codes are converted to missing
mvdecode a0_1_gps_north1 a0_1_gps_north2 a0_1_gps_north3 a0_1_gps_east1 a0_1_gps_east2 a0_1_gps_east3, mv(-888 -999 -111)

* Patient's GPS position (latitude from the "north" components, longitude from the "east" ones)
* Components 2 and 3 are joined as "<2>.<3>", divided by 60 and added to component 1
* NOTE(review): presumably component 1 is degrees and 2/3 are the integer and decimal parts
* of the minutes; confirm that component 3 cannot lose leading zeros
foreach out in lat long {
	local axis = cond("`out'" == "lat", "north", "east")
	egen gps_`out'_min = concat(a0_1_gps_`axis'2 a0_1_gps_`axis'3), punc(".")
	destring gps_`out'_min, replace force
	replace gps_`out'_min = gps_`out'_min / 60
	gen gps_`out' = a0_1_gps_`axis'1 + gps_`out'_min
}

* Distance between the patient and the center (geodist is called with the mile option)
geodist gps_lat gps_long centergpslat centergpslong, gen(dist_pat_center) mile

* Winsorizing the upper tail of the distance only (p(0.05) with highonly)
winsor dist_pat_center, gen(dist_center_winsor) p(0.05) highonly

* Time to center: code -999 is recoded to missing
replace i14_asha_counselor1 = . if i14_asha_counselor1 == -999

save "Data/Intermediate/patients_summary_stats.dta", replace


****************************************
*** PART-2 *** Checking the balance in the characteristics of patients enrolled before the experiment across the treatment and control groups
****************************************

use "Data/Intermediate/patients_summary_stats.dta", clear

* Variable lists for the balance tables. They are stored as globals because they are
* reused further down in this do-file (PART-3 and the dataset prepared for defaults.do).

* Variables measuring the education level of patients
global bl_education "rw_none rw_onlyread rw_both edu_belowprimary edu_primary edu_secondary edu_grad"

* Variables describing the social background of patients
global bl_social "male a3_age caste_dont_know caste_general caste_obc caste_sc caste_st caste_minority religion_hindu religion_muslim religion_oth"

* Variables describing the household background of patients
global bl_household "hhd_size live_alone elec tapwater tv fridge ownhouse migrate_always migrate_6plus migrate_lessthan5 i14_asha_counselor1 dist_center_winsor current_working"

* Keeping only one survey per patient and only verified patients
keep if first_complete == 1 & verified_pat == 1

* Regressions: Table 2
* The table is assembled inside the dataset itself: row k of the columns created below
* holds the statistics of the k-th balance variable.

gen var = ""
foreach stat in diff p control_mean control_sd treatment_mean treatment_sd fullsample_mean fullsample_sd N treatment_control {
	gen `stat' = .
}

preserve
keep if post_exp == 0

* The number of table rows is derived from the lists so that it cannot drift out of sync
local balance_vars $bl_social $bl_education $bl_household
local vars : word count `balance_vars'

* Each balance variable needs its own row to store its results
assert _N >= `vars'

local row = 0
foreach x of local balance_vars {
	local ++row
	replace var = "`x'" if _n == `row'

	* Treatment-control difference, controlling for strata and entry_survey, clustered SEs
	ivreg2 `x' treatment final_stratum_id1-final_stratum_id13 entry_survey if post_exp == 0, cl(uid_cluster) small
	mat beta = e(b)
	* treatment is the first regressor, hence the first element of e(b)
	replace diff = beta[1,1] if _n == `row'
	test treatment = 0
	replace p = r(p) if _n == `row'

	* Control group mean and standard deviation
	sum `x' if treatment == 0 & post_exp == 0
	replace control_mean = r(mean) if _n == `row'
	replace control_sd = r(sd) if _n == `row'

	* Treatment group mean and standard deviation
	sum `x' if treatment == 1 & post_exp == 0
	replace treatment_mean = r(mean) if _n == `row'
	replace treatment_sd = r(sd) if _n == `row'

	* Raw difference in means (applied to every row; rows not yet filled stay missing)
	replace treatment_control = treatment_mean - control_mean

	* Full sample mean, standard deviation and number of observations
	sum `x' if treatment == 0 | treatment == 1
	replace N = r(N) if _n == `row'
	replace fullsample_mean = r(mean) if _n == `row'
	replace fullsample_sd = r(sd) if _n == `row'
}

* Rounding to three decimals.
* NOTE(review): treatment_control is created after fullsample_sd, so it is outside the
* diff-fullsample_sd range and is exported unrounded; confirm whether this is intended.
foreach stat of varlist diff-fullsample_sd {
	replace `stat' = round(`stat', 0.001)
}

outsheet var fullsample_mean fullsample_sd N control_mean control_sd treatment_mean treatment_sd treatment_control p if _n <= `vars' using "Results/Paper/Table2.out", replace

restore

* Joint orthogonality test on characteristics predicting default (as identified in defaults.do): reported in Section 3.3

ivreg2 treatment male dist_center_winsor final_stratum_id1-final_stratum_id13 if post_exp == 0, cl(uid_cluster) small
test male dist_center_winsor


****************************************
*** PART-3 *** Checking the balance in the characteristics of patients enrolled after the experiment across the treatment and control groups
****************************************

use "Data/Intermediate/patients_summary_stats.dta", clear

* Keeping only one survey per patient, only verified patients, and only patients detected after randomization start date
keep if first_complete == 1 & verified_pat == 1 & post_exp == 1

* Regressions: Table 5, Panel B
* The table is assembled inside the dataset itself: row k of the columns created below
* holds the statistics of the k-th balance variable.
* The bl_social, bl_education and bl_household globals are defined in PART-2 of this do-file.

gen var = ""
foreach stat in diff p control_mean control_sd treatment_mean treatment_sd fullsample_mean fullsample_sd N treatment_control {
	gen `stat' = .
}

* The number of table rows is derived from the lists so that it cannot drift out of sync
local balance_vars $bl_social $bl_education $bl_household
local vars : word count `balance_vars'

* Each balance variable needs its own row to store its results
assert _N >= `vars'

local row = 0
foreach x of local balance_vars {
	local ++row
	replace var = "`x'" if _n == `row'

	* Treatment-control difference, controlling for strata and entry_survey, clustered SEs
	ivreg2 `x' treatment final_stratum_id1-final_stratum_id13 entry_survey, cl(uid_cluster) small
	replace N = e(N) if _n == `row'
	mat beta = e(b)
	* treatment is the first regressor, hence the first element of e(b)
	replace diff = beta[1,1] if _n == `row'
	test treatment = 0
	replace p = r(p) if _n == `row'

	* Control group mean and standard deviation
	sum `x' if treatment == 0
	replace control_mean = r(mean) if _n == `row'
	replace control_sd = r(sd) if _n == `row'

	* Treatment group mean and standard deviation
	sum `x' if treatment == 1
	replace treatment_mean = r(mean) if _n == `row'
	replace treatment_sd = r(sd) if _n == `row'

	* Raw difference in means (applied to every row; rows not yet filled stay missing)
	replace treatment_control = treatment_mean - control_mean
}

* Rounding to three decimals.
* NOTE(review): treatment_control is created after fullsample_sd, so it is outside the
* diff-fullsample_sd range and is exported unrounded; confirm whether this is intended.
* fullsample_mean and fullsample_sd are never filled in this part and are not exported.
foreach stat of varlist diff-fullsample_sd {
	replace `stat' = round(`stat', 0.001)
}

outsheet var control_mean control_sd treatment_mean treatment_sd treatment_control p N if _n <= `vars' using "Results/Paper/Table5_PanelB.out", replace

* Wild cluster bootstrap and pairs cluster bootstrap: Table C3, Panel B, and Table D3, Panel B
* Both re-estimate the balance regressions above with the same controls and clusters;
* the results are captured in log files.

* Wild cluster bootstrap

log using "Results/Appendix/TableC3_PanelB.log", replace
foreach var of varlist $bl_social $bl_education $bl_household {
	wildbootstrap regress `var' treatment final_stratum_id1-final_stratum_id13 entry_survey, cluster(uid_cluster) reps(5000) rseed(13915183)
}
log close

* Pairs cluster bootstrap

log using "Results/Appendix/TableD3_PanelB.log", replace
foreach var of varlist $bl_social $bl_education $bl_household {
	clustse regress `var' treatment final_stratum_id1-final_stratum_id13 entry_survey, cluster(uid_cluster) method(pairs) reps(5000) seed(13915183)
}
log close


** Building the covariate dataset that is merged in defaults.do

use "Data/Intermediate/patients_summary_stats.dta", clear

* Only the survey identifiers and the balance-table covariates are retained
* (the bl_* globals are defined in PART-2 of this do-file)
local id_vars q1_ques_code a02_entry_exit
local covariates $bl_social $bl_education $bl_household
keep `id_vars' `covariates'

save "Data/Intermediate/patients_summary_stats_defaults.dta", replace
